In [1]:
import folium
import pandas as pd
import numpy as np
from folium.plugins import MarkerCluster
import matplotlib.pyplot as plt
import seaborn as sns
# Load datasets
clusters_df = pd.read_csv('latitude_longitude_with_clusters.csv')
city_data = pd.read_csv('final_merged_data_with_weather.csv')
# Function to find nearest city data
def find_nearest_city(lat, lon, city_df):
    """Return the row of ``city_df`` whose coordinates are closest to a point.

    Closeness is the straight-line distance between (latitude, longitude)
    pairs, computed directly on the coordinate values. No great-circle or
    cos(latitude) correction is applied.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the query point, in the same units as the
        ``latitude`` / ``longitude`` columns of ``city_df``.
    city_df : pandas.DataFrame
        Candidate rows; must contain ``latitude`` and ``longitude`` columns.
        Index labels are assumed to be unique (a duplicated label would make
        the label-based lookup return several rows).

    Returns
    -------
    pandas.Series
        The closest row of ``city_df``.

    Raises
    ------
    ValueError
        If ``city_df`` has no rows.
    """
    if city_df.empty:
        raise ValueError("city_df has no rows to search")
    distances = ((city_df['latitude'] - lat) ** 2 + (city_df['longitude'] - lon) ** 2) ** 0.5
    # idxmin() returns an index *label*, so the lookup must be label-based
    # (.loc). Using .iloc here selects the wrong row, or raises, whenever the
    # frame's index is not the default 0..n-1 range (e.g. after filtering).
    return city_df.loc[distances.idxmin()]
# Calculate seasonal metrics
# Calendar months assigned to each season, in the order the columns are added.
SEASON_MONTHS = {
    'winter': [12, 1, 2],
    'spring': [3, 4, 5],
    'summer': [6, 7, 8],
    'fall': [9, 10, 11],
}

# For every season add two columns to city_data: the row-wise mean of the
# season's three monthly temperature columns, then the same for precipitation.
for season, months in SEASON_MONTHS.items():
    for metric in ('temp', 'precip'):
        monthly_cols = [f'{metric}_month_{month}' for month in months]
        city_data[f'{season}_{metric}'] = city_data[monthly_cols].mean(axis=1)
# Create map
# Baseline annual cost of living in USD: a Cost of Living Index of 100 maps to
# exactly this amount. The popup label repeats the figure as literal text.
BASE_ANNUAL_COST = 77280
# Fraction of the adjusted annual cost attributed to each spending category.
FOOD_SHARE = 0.161
HOUSING_SHARE = 0.232
UTILITIES_SHARE = 0.101
TRANSPORTATION_SHARE = 0.186
HEALTHCARE_SHARE = 0.096
# Marker colour thresholds applied to the matched city's `total_crime` value.
SAFE_CRIME_BELOW = 5
HIGH_CRIME_ABOVE = 10

m = folium.Map(location=[37.5, -98.0], zoom_start=5)
marker_cluster = MarkerCluster().add_to(m)

# One marker per row of clusters_df. Rows are iterated as plain dicts rather
# than with iterrows(): iterrows() coerces each row to a single dtype, so an
# integer cluster id in an otherwise numeric frame would be shown as "3.0".
for uni in clusters_df.to_dict('records'):
    city_data_point = find_nearest_city(uni['LATITUDE'], uni['LONGITUDE'], city_data)

    # Scale the baseline by the city's index, then split it by category share.
    adjusted_cost = (city_data_point['Cost of Living Index'] / 100) * BASE_ANNUAL_COST
    cost_of_food = adjusted_cost * FOOD_SHARE
    cost_of_housing = adjusted_cost * HOUSING_SHARE
    cost_of_utils = adjusted_cost * UTILITIES_SHARE
    cost_of_transportation = adjusted_cost * TRANSPORTATION_SHARE
    cost_of_healthcare = adjusted_cost * HEALTHCARE_SHARE

    # The HTML lines are kept at column 0 so the emitted markup is unchanged.
    popup_content = f"""
<div style='width: 300px'>
<h4>Cluster {uni['Clusters']}</h4>
<b>Location:</b> {city_data_point['City']}, {city_data_point['State']}<br>
<b>Economic Profile:</b><br>
Cost of Living Index: {city_data_point['Cost of Living Index']:.1f}<br>
Adjusted Cost of Living per Year(77,280 base): ${adjusted_cost:,.2f}<br>
Average Food Expenses: ${cost_of_food:,.2f}<br>
Average Housing Expenses: ${cost_of_housing:,.2f}<br>
Average Utilities Expenses: ${cost_of_utils:,.2f}<br>
Average Transportation Expenses: ${cost_of_transportation:,.2f}<br>
Average Healthcare Expenses: ${cost_of_healthcare:,.2f}<br>
Average Income: ${city_data_point['Average Income']:,.0f}<br>
<b>Campus Safety:</b><br>
Reported Crimes: {city_data_point['total_crime']}<br>
<b>Climate:</b><br>
<i>Temperature (°C):</i><br>
Winter: {city_data_point['winter_temp']:.1f}<br>
Spring: {city_data_point['spring_temp']:.1f}<br>
Summer: {city_data_point['summer_temp']:.1f}<br>
Fall: {city_data_point['fall_temp']:.1f}<br>
<i>Precipitation (mm):</i><br>
Winter: {city_data_point['winter_precip']:.1f}<br>
Spring: {city_data_point['spring_precip']:.1f}<br>
Summer: {city_data_point['summer_precip']:.1f}<br>
Fall: {city_data_point['fall_precip']:.1f}
</div>
"""

    # Colour the marker by crime count. A missing count compares False against
    # both thresholds and would otherwise be reported as "moderate", so it is
    # given a neutral colour instead.
    total_crime = city_data_point['total_crime']
    if pd.isna(total_crime):
        color = 'gray'  # no crime data for the matched city
    elif total_crime < SAFE_CRIME_BELOW:
        color = 'green'  # safe campus
    elif total_crime > HIGH_CRIME_ABOVE:
        color = 'red'  # high crime
    else:
        color = 'orange'  # moderate

    folium.Marker(
        location=[uni['LATITUDE'], uni['LONGITUDE']],
        popup=popup_content,
        icon=folium.Icon(color=color),
    ).add_to(marker_cluster)
# Create visualization plots
# 2x2 grid: seasonal temperature, seasonal precipitation, cost vs crime,
# cost vs income.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Each per-cluster statistic is the mean of one column taken over the
# nearest-city rows matched to the universities in that cluster.
STAT_SOURCE_COLUMNS = {
    'avg_cost': 'Cost of Living Index',
    'avg_income': 'Average Income',
    'avg_crime': 'total_crime',
    'winter_temp': 'winter_temp',
    'spring_temp': 'spring_temp',
    'summer_temp': 'summer_temp',
    'fall_temp': 'fall_temp',
    'winter_precip': 'winter_precip',
    'spring_precip': 'spring_precip',
    'summer_precip': 'summer_precip',
    'fall_precip': 'fall_precip',
}

cluster_stats = {}
for cluster_id in clusters_df['Clusters'].unique():
    cluster_unis = clusters_df[clusters_df['Clusters'] == cluster_id]
    nearest_rows = [
        find_nearest_city(row['LATITUDE'], row['LONGITUDE'], city_data)
        for _, row in cluster_unis.iterrows()
    ]
    cluster_data = pd.DataFrame(nearest_rows)
    cluster_stats[cluster_id] = {
        stat: cluster_data[column].mean()
        for stat, column in STAT_SOURCE_COLUMNS.items()
    }

# Transpose so clusters become rows and statistics become columns.
cluster_df = pd.DataFrame(cluster_stats).T

# Column-name prefix -> tick label used on the seasonal box plots.
SEASON_LABELS = {'winter': 'Winter', 'spring': 'Spring', 'summer': 'Summer', 'fall': 'Fall'}

temp_data = cluster_df[[f'{prefix}_temp' for prefix in SEASON_LABELS]].rename(
    columns={f'{prefix}_temp': label for prefix, label in SEASON_LABELS.items()}
)
precip_data = cluster_df[[f'{prefix}_precip' for prefix in SEASON_LABELS]].rename(
    columns={f'{prefix}_precip': label for prefix, label in SEASON_LABELS.items()}
)

# Panels 1 and 2: one box per season, drawn from the per-cluster means.
seasonal_panels = [
    (ax1, temp_data, 'Seasonal Temperature Distribution by Cluster', 'Temperature (°C)'),
    (ax2, precip_data, 'Seasonal Precipitation Distribution by Cluster', 'Precipitation (mm)'),
]
for axis, frame, title, y_label in seasonal_panels:
    sns.boxplot(data=frame, ax=axis)
    axis.set_title(title)
    axis.set_ylabel(y_label)

# Panels 3 and 4: one point per cluster, cost-of-living index on the x axis.
scatter_panels = [
    (ax3, 'avg_crime', 'Average Cost of Living vs Campus Crime', 'Average Reported Crimes'),
    (ax4, 'avg_income', 'Cost of Living vs Average Income', 'Average Income ($)'),
]
for axis, y_column, title, y_label in scatter_panels:
    sns.scatterplot(data=cluster_df, x='avg_cost', y=y_column, ax=axis)
    axis.set_title(title)
    axis.set_xlabel('Cost of Living Index')
    axis.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Write the folium map built earlier to a standalone HTML file.
m.save('cluster_visualization.html')
# Build a nested per-cluster profile (size, economics, safety, climate) from
# the nearest-city rows matched to the universities in each cluster.
PROFILE_SEASONS = ('winter', 'spring', 'summer', 'fall')

profiles = {}
for cluster_id in clusters_df['Clusters'].unique():
    cluster_unis = clusters_df[clusters_df['Clusters'] == cluster_id]
    cluster_data = pd.DataFrame([
        find_nearest_city(row['LATITUDE'], row['LONGITUDE'], city_data)
        for _, row in cluster_unis.iterrows()
    ])

    cost_index = cluster_data['Cost of Living Index']
    income = cluster_data['Average Income']
    crime_counts = cluster_data['total_crime']

    temperature = {
        name: cluster_data[f'{name}_temp'].mean() for name in PROFILE_SEASONS
    }
    precipitation = {
        name: cluster_data[f'{name}_precip'].mean() for name in PROFILE_SEASONS
    }
    # Row-wise mean of the four seasonal columns, then averaged over the rows.
    # NOTE(review): this is a mean, not a yearly total - confirm that is what
    # 'annual' is meant to convey.
    precipitation['annual'] = (
        cluster_data[[f'{name}_precip' for name in PROFILE_SEASONS]]
        .mean(axis=1)
        .mean()
    )

    profiles[cluster_id] = {
        'size': {
            'num_universities': len(cluster_unis),
            # First three distinct city names in row order; no ranking by
            # size or frequency is applied.
            'major_cities': ', '.join(cluster_data['City'].unique()[:3]),
        },
        'economics': {
            'avg_cost_index': cost_index.mean(),
            'cost_range': (cost_index.min(), cost_index.max()),
            'avg_income': income.mean(),
            'income_range': (income.min(), income.max()),
        },
        'safety': {
            'avg_reported_crimes': crime_counts.mean(),
            'total_reported_crimes': crime_counts.sum(),
            # Same cut-offs as the marker colours: below 5 / above 10.
            'num_safe_campuses': len(cluster_data[crime_counts < 5]),
            'num_high_crime_campuses': len(cluster_data[crime_counts > 10]),
        },
        'climate': {
            'temperature': temperature,
            'precipitation': precipitation,
        },
    }
In [2]:
# Bare last expression: shows the folium map `m` as this cell's output.
m
Out[2]:
Make this Notebook Trusted to load map: File -> Trust Notebook